{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Explore Data In Redshift\n",
    "In this notebook, we will visualize the 2014 and 2015 data that we previously loaded into Redshift.\n",
    "\n",
    "_Note:  This notebook requires that you are running this SageMaker Notebook Instance in a VPC with access to the Redshift cluster._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "import boto3\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "redshift = boto3.client('redshift')\n",
    "secretsmanager = boto3.client('secretsmanager')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Set Redshift Connection Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "redshift_schema = 'redshift'\n",
    "redshift_cluster_identifier = 'dsoaws'\n",
    "redshift_host = 'dsoaws'\n",
    "redshift_database = 'dsoaws'\n",
    "redshift_port = '5439'\n",
    "redshift_table_2015 = 'amazon_reviews_tsv_2015'\n",
    "redshift_table_2014 = 'amazon_reviews_tsv_2014'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the Redshift Secrets from Secrets Manager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')\n",
    "cred = json.loads(secret['SecretString'])\n",
    "\n",
    "redshift_username = cred[0]['username']\n",
    "redshift_pw = cred[1]['password']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Please Wait for Cluster Status  `Available`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "redshift = boto3.client('redshift')\n",
    "\n",
    "response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)\n",
    "cluster_status = response['Clusters'][0]['ClusterStatus']\n",
    "print(cluster_status)\n",
    "\n",
    "while cluster_status != 'available':\n",
    "    time.sleep(10)\n",
    "    response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)\n",
    "    cluster_status = response['Clusters'][0]['ClusterStatus']\n",
    "    print(cluster_status)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get Redshift Endpoint Address"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "redshift_endpoint_address = response['Clusters'][0]['Endpoint']['Address']\n",
    "\n",
    "print(redshift_endpoint_address)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create the Redshift Connection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import awswrangler as wr\n",
    "\n",
    "con_redshift = wr.data_api.redshift.connect(\n",
    "    cluster_id=redshift_cluster_identifier,\n",
    "    database=redshift_database,\n",
    "    db_user=redshift_username,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare For Visualizations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Set Seaborn parameters in advance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set_style = 'seaborn-whitegrid'\n",
    "\n",
    "sns.set(rc={\"font.style\":\"normal\",\n",
    "            \"axes.facecolor\":\"white\",\n",
    "            'grid.color': '.8',\n",
    "            'grid.linestyle': '-',\n",
    "            \"figure.facecolor\":\"white\",\n",
    "            \"figure.titlesize\":20,\n",
    "            \"text.color\":\"black\",\n",
    "            \"xtick.color\":\"black\",\n",
    "            \"ytick.color\":\"black\",\n",
    "            \"axes.labelcolor\":\"black\",\n",
    "            \"axes.grid\":True,\n",
    "            'axes.labelsize':10,\n",
    "            'figure.figsize':(10.0, 10.0),\n",
    "            'xtick.labelsize':10,\n",
    "            'font.size':10,\n",
    "            'ytick.labelsize':10})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom code to display values on bars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_values_barplot(axs, space):\n",
    "    def _show_on_plot(ax):\n",
    "        for p in ax.patches:\n",
    "            _x = p.get_x() + p.get_width() + float(space)\n",
    "            _y = p.get_y() + p.get_height()\n",
    "            value = round(float(p.get_width()),2)\n",
    "            ax.text(_x, _y, value, ha=\"left\")\n",
    "\n",
    "    if isinstance(axs, np.ndarray):\n",
    "        for idx, ax in np.ndenumerate(axs):\n",
    "            _show_on_plot(ax)\n",
    "    else:\n",
    "        _show_on_plot(axs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Which product categories had the most reviews in 2015?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT\n",
    "    year,\n",
    "    product_category,\n",
    "    COUNT(star_rating) AS count_star_rating   \n",
    "FROM\n",
    "    {}.{}  \n",
    "GROUP BY\n",
    "    product_category,\n",
    "    year  \n",
    "ORDER BY\n",
    "    count_star_rating DESC,\n",
    "    year DESC\n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store counts\n",
    "count_ratings = df['count_star_rating']\n",
    "\n",
    "# Store number of categories\n",
    "num_categories = df['product_category'].count()\n",
    "print(num_categories)\n",
    "\n",
    "# Store max ratings\n",
    "max_ratings = df['count_star_rating'].max()\n",
    "print(max_ratings)\n",
    "\n",
    "# Store min ratings\n",
    "min_ratings = df['count_star_rating'].min()\n",
    "print(min_ratings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create Seaborn barplot\n",
    "barplot = sns.barplot(y='product_category', x='count_star_rating', data = df, saturation=1)\n",
    "\n",
    "# Scale if needed\n",
    "if num_categories < 10:\n",
    "        sns.set(rc={'figure.figsize':(10.0, 5.0)})\n",
    "    \n",
    "# Set title\n",
    "barplot.set_title(\"Number of Ratings Per Product Category (2015)\", fontsize=\"20\")\n",
    "\n",
    "# Set x-axis ticks to match scale \n",
    "if max_ratings <= 200000:\n",
    "    plt.xticks([50000, 100000, 150000, 200000], ['50K', '100K', '150K', '200K'])\n",
    "    plt.xlim(0, 200000)\n",
    "elif max_ratings <= 5000000:\n",
    "    plt.xticks([1000000, 2000000, 3000000, 4000000, 5000000], ['1m', '2m', '3m', '4m', '5m'])\n",
    "    plt.xlim(0, 5000000)\n",
    "else:\n",
    "    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])\n",
    "    plt.xlim(0, 20000000)\n",
    "\n",
    "plt.xlabel(\"Number of Ratings\", fontsize=\"15\")\n",
    "plt.ylabel(\"Product Category\", fontsize=\"15\")\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "# Export plot if needed\n",
    "# plt.savefig('redshift_2015_ratings_per_category.png', dpi=300)\n",
    "\n",
    "# Show the barplot\n",
    "plt.show(barplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-09.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Which products have the most helpful reviews in 2015?\n",
    "How long are those reviews?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT\n",
    "    product_title,\n",
    "    helpful_votes,\n",
    "    LENGTH(review_body) AS review_body_length,\n",
    "    SUBSTRING(review_body, 1, 100) AS review_body_substring \n",
    "FROM\n",
    "    {}.{} \n",
    "ORDER BY\n",
    "    helpful_votes DESC LIMIT 10 \n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Results for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following result:\n",
    "\n",
    "<img src=\"img/most_helpful_2015.png\"  width=\"90%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. What is the breakdown of star ratings (1-5) per product category in 2015?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT\n",
    "    product_category,\n",
    "    star_rating,\n",
    "    COUNT(DISTINCT review_id) AS count_reviews \n",
    "FROM\n",
    "    {}.{} \n",
    "GROUP BY\n",
    "    product_category,\n",
    "    star_rating \n",
    "ORDER BY\n",
    "    product_category ASC,\n",
    "    star_rating DESC,\n",
    "    count_reviews\n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Which product categories are the highest rated by average rating?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement\n",
    "statement = \"\"\"\n",
    "SELECT\n",
    "    product_category,\n",
    "    COUNT(star_rating) AS count_stars,\n",
    "    AVG(star_rating::FLOAT) AS avg_star_rating\n",
    "FROM\n",
    "    {}.{}  \n",
    "GROUP BY\n",
    "    product_category  \n",
    "ORDER BY\n",
    "    avg_star_rating DESC\n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_average_rating = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_average_rating"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stacked percentage horizontal bar plot showing proportion of star ratings per product category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create grouped DataFrames by category and by star rating\n",
    "grouped_category = df.groupby('product_category')\n",
    "grouped_star = df.groupby('star_rating')\n",
    "\n",
    "# Create sum of ratings per star rating\n",
    "df_sum = df.groupby(['star_rating']).sum()\n",
    "\n",
    "# Calculate total number of star ratings\n",
    "total = df_sum['count_reviews'].sum()\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dictionary of product categories and array of star rating distribution per category\n",
    "distribution = {}\n",
    "count_reviews_per_star = []\n",
    "i=0\n",
    "    \n",
    "for category, ratings in grouped_category:\n",
    "    count_reviews_per_star = []\n",
    "    for star in ratings['star_rating']:\n",
    "        count_reviews_per_star.append(ratings.at[i, 'count_reviews'])\n",
    "        i=i+1;\n",
    "    distribution[category] = count_reviews_per_star\n",
    "\n",
    "# Check if distribution has been created succesfully\n",
    "print(distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if distribution keys are set correctly to product categories\n",
    "print(distribution.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if star rating distributions are set correctly\n",
    "print(distribution.items())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build array per star across all categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sort distribution by highest average rating per category\n",
    "sorted_distribution = {}\n",
    "\n",
    "df_average_rating.iloc[:,0]\n",
    "for index, value in df_average_rating.iloc[:,0].items():\n",
    "    sorted_distribution[value] = distribution[value]\n",
    "\n",
    "sorted_distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build array per star across all categories\n",
    "star1 = []\n",
    "star2 = []\n",
    "star3 = []\n",
    "star4 = []\n",
    "star5 = []\n",
    "\n",
    "for k in sorted_distribution.keys():\n",
    "    stars = sorted_distribution.get(k)\n",
    "    star5.append(stars[0])\n",
    "    star4.append(stars[1])\n",
    "    star3.append(stars[2])\n",
    "    star2.append(stars[3])\n",
    "    star1.append(stars[4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = sorted_distribution.keys()\n",
    "\n",
    "total = np.array(star1) + np.array(star2) + np.array(star3) + np.array(star4) + np.array(star5)\n",
    "\n",
    "proportion_star1 = np.true_divide(star1, total) * 100\n",
    "proportion_star2 = np.true_divide(star2, total) * 100\n",
    "proportion_star3 = np.true_divide(star3, total) * 100\n",
    "proportion_star4 = np.true_divide(star4, total) * 100\n",
    "proportion_star5 = np.true_divide(star5, total) * 100\n",
    "\n",
    "# Add colors\n",
    "colors = ['red', 'purple','blue','orange','green']\n",
    "\n",
    "# The position of the bars on the x-axis\n",
    "r = range(len(categories))\n",
    "barHeight = 1\n",
    "\n",
    "# Plot bars\n",
    "if num_categories > 10:\n",
    "    plt.figure(figsize=(10,10))\n",
    "else: \n",
    "    plt.figure(figsize=(10,5))\n",
    "\n",
    "ax5 = plt.barh(r, proportion_star5, color=colors[4], edgecolor='white', height=barHeight, label='5-Star Ratings')\n",
    "ax4 = plt.barh(r, proportion_star4, left=proportion_star5, color=colors[3], edgecolor='white', height=barHeight, label='4-Star Ratings')\n",
    "ax3 = plt.barh(r, proportion_star3, left=proportion_star5+proportion_star4, color=colors[2], edgecolor='white', height=barHeight, label='3-Star Ratings')\n",
    "ax2 = plt.barh(r, proportion_star2, left=proportion_star5+proportion_star4+proportion_star3, color=colors[1], edgecolor='white', height=barHeight, label='2-Star Ratings')\n",
    "ax1 = plt.barh(r, proportion_star1, left=proportion_star5+proportion_star4+proportion_star3+proportion_star2, color=colors[0], edgecolor='white', height=barHeight, label=\"1-Star Ratings\")\n",
    "\n",
    "plt.title(\"Distribution of Reviews Per Rating Per Category (2015)\",fontsize='16')\n",
    "plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n",
    "plt.yticks(r, categories, fontweight='regular')\n",
    "\n",
    "plt.xlabel(\"% Breakdown of Star Ratings\", fontsize='14')\n",
    "plt.gca().invert_yaxis()\n",
    "plt.tight_layout()\n",
    "\n",
    "# plt.savefig('redshift_2015_proportion_star_per_category.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-10.png\"  width=\"70%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. How did the star ratings change during 2015?\n",
    "Is there a drop-off point for certain product categories throughout the year?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT\n",
    "    CAST(DATE_PART('month', TO_DATE(review_date, 'YYYY-MM-DD')) AS INTEGER) AS month,\n",
    "    AVG(star_rating::FLOAT) AS avg_rating  \n",
    "FROM\n",
    "    {}.{} \n",
    "GROUP BY\n",
    "    month\n",
    "ORDER BY\n",
    "    month\n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.gcf()\n",
    "fig.set_size_inches(12,5)\n",
    "\n",
    "fig.suptitle('Average Star Rating Throughout 2015 (Across Subset Of Product Categories)')\n",
    "\n",
    "ax = plt.gca()\n",
    "\n",
    "ax.locator_params(integer=True)\n",
    "ax.set_xticks(df['month'].unique())\n",
    "\n",
    "df.plot(kind='line',x='month',y='avg_rating', color='red', ax=ax)\n",
    "\n",
    "plt.xlabel('Months')\n",
    "plt.ylabel('Average Star Rating')\n",
    "\n",
    "# fig.savefig('redshift_2015_average_rating.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/c4-11.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Is there a drop-off point for certain product categories throughout the year?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "statement = \"\"\"\n",
    "SELECT\n",
    "    product_category,\n",
    "    CAST(DATE_PART('month', TO_DATE(review_date, 'YYYY-MM-DD')) AS INTEGER) AS month,\n",
    "    AVG(star_rating::FLOAT) AS avg_rating  \n",
    "FROM\n",
    "    {}.{} \n",
    "GROUP BY\n",
    "    product_category, month\n",
    "ORDER BY\n",
    "    product_category, month\n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_categories(df):\n",
    "    df_categories = df['product_category'].unique()\n",
    "    for category in df_categories:\n",
    "        # print(category)\n",
    "        df_plot = df.loc[df['product_category'] == category]\n",
    "        df_plot.plot(kind='line',x='month',y='avg_rating', c=np.random.rand(3,), ax=ax, label=category)\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = plt.gcf()\n",
    "fig.set_size_inches(12,5)\n",
    "\n",
    "fig.suptitle('Average Star Rating Over Time Per Category In 2015')\n",
    "           \n",
    "ax = plt.gca()\n",
    "\n",
    "ax.locator_params(integer=True)\n",
    "ax.set_xticks(df['month'].unique())\n",
    "\n",
    "plot_categories(df)\n",
    "\n",
    "plt.xlabel('Month')\n",
    "plt.ylabel('Average Star Rating')\n",
    "plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=2, ncol=2, mode=\"expand\", borderaxespad=0)\n",
    "\n",
    "# fig.savefig('redshift_2015_average_rating_category.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "![](img/redshift_2015_average_rating_category.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. Which customers are abusing the review system in 2015 by repeatedly reviewing the same product more than once?  \n",
    "What was their average star rating for each product?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SQL statement \n",
    "statement = \"\"\"\n",
    "SELECT\n",
    "    customer_id,\n",
    "    product_category,\n",
    "    product_title,\n",
    "    ROUND(AVG(star_rating::FLOAT), 4) AS avg_star_rating,\n",
    "    COUNT(*) AS review_count  \n",
    "FROM\n",
    "    redshift.amazon_reviews_tsv_2015 \n",
    "GROUP BY\n",
    "    customer_id,\n",
    "    product_category,\n",
    "    product_title  \n",
    "HAVING\n",
    "    COUNT(*) > 1  \n",
    "ORDER BY\n",
    "    review_count DESC LIMIT 5 \n",
    "\"\"\".format(redshift_schema, redshift_table_2015)\n",
    "\n",
    "print(statement)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = wr.data_api.redshift.read_sql_query(\n",
    "    sql=statement,\n",
    "    con=con_redshift,\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualization for All Product Categories (150+ Million Reviews)\n",
    "If you ran this same query across all product categories (150+ million reviews), you would see the following visualization:\n",
    "\n",
    "<img src=\"img/redshift-abuse-all.png\"  width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "\n",
    "try {\n",
    "    Jupyter.notebook.save_checkpoint();\n",
    "    Jupyter.notebook.session.delete();\n",
    "}\n",
    "catch(err) {\n",
    "    // NoOp\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}